Global time series analysis¶

Load libraries¶

import warnings
from functools import partial

import covid_analysis.utils.paths as path
import janitor
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pandas_flavor as pf
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import seaborn as sns
from plotly.offline import init_notebook_mode
from pmdarima.arima import ADFTest, auto_arima
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import acf, adfuller, pacf

Set defaults for plots¶

# matplotlib
# NOTE(review): the "seaborn-whitegrid" style name was renamed to
# "seaborn-v0_8-whitegrid" in matplotlib >= 3.6 -- confirm the pinned version.
plt.style.use("seaborn-whitegrid")
plt.rcParams["figure.figsize"] = (10, 8)

# seaborn
sns.set_style("whitegrid")

# plotly
init_notebook_mode()
pio.templates.default = "plotly_white"
# Route pandas .plot() calls through plotly instead of matplotlib.
pd.options.plotting.backend = "plotly"

# Silence warnings globally (mostly plotting noise).
warnings.filterwarnings("ignore")

Utility functions¶

def series_corr_plot(series, plot_pacf=False, alpha=0.05, **kwargs):
    """Plot the autocorrelation (ACF) or partial autocorrelation (PACF)
    of a time series as a Plotly figure.

    Parameters
    ----------
    series : pandas.Series or single-column DataFrame
        Series to analyse; NaN values are dropped before computing.
    plot_pacf : bool, optional
        If True plot the PACF instead of the ACF (default False).
    alpha : float or None, optional
        Confidence level for the interval band (default 0.05). When
        None, statsmodels returns no interval and no band is drawn
        (the original implementation crashed in that case).
    **kwargs
        Forwarded to statsmodels ``acf``/``pacf`` (e.g. ``nlags``, ``fft``).

    Returns
    -------
    plotly.graph_objects.Figure
    """
    if plot_pacf:
        title = "Partial Autocorrelation (PACF)"
        corr_result = pacf(series.dropna(), alpha=alpha, **kwargs)
    else:
        title = "Autocorrelation (ACF)"
        corr_result = acf(series.dropna(), alpha=alpha, **kwargs)

    # With alpha set, statsmodels returns (correlations, conf_intervals);
    # with alpha=None it returns the correlation array alone.
    if alpha is not None:
        correlation, conf_interval = corr_result[0], corr_result[1]
        # Center the confidence band on zero so it wraps the stems.
        lower_y = conf_interval[:, 0] - correlation
        upper_y = conf_interval[:, 1] - correlation
    else:
        correlation = corr_result
        lower_y = upper_y = None

    lags = np.arange(len(correlation))

    fig = go.Figure()

    # Vertical stems from zero up to each correlation value.
    for i_lag in lags:
        fig.add_scatter(
            x=(i_lag, i_lag),
            y=(0, correlation[i_lag]),
            mode="lines",
            line_color="#3f3f3f",
            showlegend=False,
            hoverinfo='none'
        )

    # Markers at the tip of each stem.
    fig.add_scatter(
        x=lags,
        y=correlation,
        name="Correlation",
        mode="markers",
        marker_color="#1f77b4",
        marker_size=12,
        showlegend=False
    )

    if upper_y is not None:
        # Invisible upper-bound trace plus a filled lower-bound trace
        # render the confidence interval as a shaded band.
        fig.add_scatter(
            name="Upper Bound",
            x=lags,
            y=upper_y,
            mode="lines",
            line_color="rgba(255,255,255,0)",
            showlegend=False,
        )

        fig.add_scatter(
            name="Lower Bound",
            x=lags,
            y=lower_y,
            mode="lines",
            line_color="rgba(255,255,255,0)",
            fillcolor="rgba(32, 146, 230,0.3)",
            fill="tonexty",
            showlegend=False
        )

    fig.update_yaxes(zerolinecolor='#000000')
    fig.update_layout(
        title=title,
        xaxis_title="Number of lags",
        hovermode="x unified"
    )

    return fig
def print_adfuller(adfuller_result):
    """Pretty-print the statsmodels ``adfuller`` result tuple.

    Expects the standard return shape: index 0 is the ADF statistic,
    index 1 the p-value, index 4 the dict of critical values.
    """
    statistic = adfuller_result[0]
    p_value = adfuller_result[1]
    critical_values = adfuller_result[4]

    report = (
        "\n"
        f"    ADF Statistic: {statistic}\n"
        f"    p-value: {p_value}\n"
        f"    Critical values {critical_values}\n"
        "    "
    )
    print(report)

Define input and output directory¶

# Project directories resolved by the local covid_analysis.utils.paths helpers.
input_dir = path.data_processed_dir()
output_dir = path.models_dir()

Load data¶

Confirmed and death time series¶

# Cumulative confirmed/deaths per country per day (Johns Hopkins data).
# .transform_column is a pyjanitor method: parses the "date" strings
# into datetime64 values.
hopkins_tidy_cumulative_df = (
    pd.read_csv(
        filepath_or_buffer=input_dir.joinpath("hopkins_tidy_cumulative.csv")
    )
    .transform_column("date", pd.to_datetime)
)

hopkins_tidy_cumulative_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118755 entries, 0 to 118754
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   country    118755 non-null  object        
 1   date       118755 non-null  datetime64[ns]
 2   confirmed  118755 non-null  int64         
 3   deaths     118755 non-null  int64         
dtypes: datetime64[ns](1), int64(2), object(1)
memory usage: 3.6+ MB

Vaccination¶

# Cumulative vaccination counts per country per day.
# NOTE(review): "date" is left as plain strings here (no to_datetime),
# unlike the Hopkins frame above -- confirm that is intended.
vaccination_tidy_cumulative_df = (
    pd.read_csv(
        filepath_or_buffer=input_dir.joinpath("vaccination_country_cumulative.csv")
    )
)

vaccination_tidy_cumulative_df.head(1)
country date doses_admin people_partially_vaccinated people_fully_vaccinated
0 Afghanistan 2021-02-22 0 0.0 0.0

Transform cumulative observations to daily changes in global level¶

# Aggregate all countries into one global series, then turn cumulative
# totals into daily changes with diff(); the first day becomes NaN and
# is dropped.
hopkins_tidy_global_daily_df = (
    hopkins_tidy_cumulative_df
    .remove_columns("country")  # pyjanitor: drop the country column
    .groupby("date")
    .sum()
    .diff()
    .dropna()
    .reset_index()
)

hopkins_tidy_global_daily_df.head(1)
date confirmed deaths
0 2020-01-23 98.0 1.0

Pandemic behavior to date¶

# Daily confirmed/deaths with 7-day rolling mean and std, one facet per variable.
(
    hopkins_tidy_global_daily_df
    .set_index("date")
    # Attach rolling statistics as extra columns suffixed _mean / _std.
    .pipe(
        lambda df: (
            df
            .join(
                df.rolling("7D").mean(),
                rsuffix="_mean"
            )
            .join(
                df.rolling("7D").std(),
                rsuffix="_std"
            )
        )
    )
    .reset_index()
    # pyjanitor: wide -> long, one row per (date, metric) pair.
    .pivot_longer(
        index="date",
        names_to="metric"
    )
    .assign(
        # Split e.g. "confirmed_mean" into variable="confirmed" and a
        # readable trace name; bare "confirmed"/"deaths" map to "Daily".
        variable=lambda df: df.metric.str.extract(r"(confirmed|deaths)"),
        metric=lambda df: (
            df.metric.str.extract(r"(?:confirmed|deaths)_?(.+)")
            .fillna("Daily")
            .replace(
                {
                    "mean": "Rolling mean (7 days)",
                    "std": "Rolling std (7 days)"
                }
            )
        )
    )
    .pipe(
        lambda df: (
            px.line(
                df,
                x="date",
                y="value",
                color="metric",
                facet_row="variable",
                labels=dict(
                    date="Date",
                    value="Observations number",
                    metric="Trace",
                    variable="Variable"
                )
            )
            # Let each facet scale its own y axis.
            .update_yaxes(matches=None)
            .update_layout(
                legend=dict(
                    orientation="h",
                    yanchor="bottom",
                    y=1.02
                ),
                margin={
                    "r": 0.7,
                    "l": 0,
                },
                hovermode="x unified"
            )
        )
    )
)

Fatality rate¶

Cumulative¶

# Cumulative fatality rate: total deaths / total confirmed, in percent.
# NOTE(review): .sum() here also hits the non-numeric "country" column;
# behavior differs across pandas versions (drop vs raise) -- verify.
(
    hopkins_tidy_cumulative_df
    .groupby("date")
    .sum()
    .assign(
        fatality_rate=lambda df: (df.deaths / df.confirmed * 100).round(2)
    )
    .reset_index()
    .pipe(
        lambda df: (
            px.area(
                df,
                x="date",
                y="fatality_rate",
                labels=dict(
                    date="Date",
                    fatality_rate="Fatality rate"
                )
            )
            .update_layout(
                title="Cumulative",
                yaxis=dict(ticksuffix="%"),
                margin={
                    "r": 0.7,
                    "l": 0,
                },
            )
        )
    )
)
# Daily fatality rate: same-day deaths / same-day confirmed cases.
# NOTE(review): this ignores the lag between case confirmation and death,
# and days with zero confirmed cases divide by zero -- confirm acceptable.
(
    hopkins_tidy_global_daily_df
    .assign(
        fatality_rate=lambda df: (df.deaths / df.confirmed * 100).round(2)
    )
    .reset_index()
    .pipe(
        lambda df: (
            px.area(
                df,
                x="date",
                y="fatality_rate",
                labels=dict(
                    date="Date",
                    fatality_rate="Fatality rate"
                )
            )
            .update_layout(
                title="Daily",
                yaxis=dict(ticksuffix="%"),
                margin={
                    "r": 0.7,
                    "l": 0,
                },
            )
        )
    )
)

Analysis of death time series¶

Subset deaths time series¶

# Keep only the deaths series, indexed by date.
global_daily_deaths = (
    hopkins_tidy_global_daily_df
    .select_columns(["date", "deaths"])  # pyjanitor
    .set_index("date")
)

# Quick look at the raw series (plotly backend configured above).
(
    global_daily_deaths
    .plot()
    .update_layout(
        xaxis_title="Date",
        yaxis_title="Number of deaths",
        showlegend=False
    )
)

Find critical points in the behavior of the time series¶

Daily autocorrelation¶

# ACF of the raw daily deaths, first 30 lags.
series_corr_plot(global_daily_deaths, nlags=30, fft=False)

Check for transformations and differencing of the data¶

# pmdarima's ADF test; should_diff returns (p-value, needs_differencing?).
adf_test = ADFTest(alpha=0.01)
print(adf_test.should_diff(global_daily_deaths))
# After one round of differencing the series tests stationary.
print(adf_test.should_diff(global_daily_deaths.diff().dropna()))
(0.8857597080380056, True)
(0.01, False)
# Candidate transformations toward stationarity, each checked below with
# statsmodels' adfuller test.
# NOTE(review): log/sqrt are applied before differencing; log(0) yields
# -inf on zero-death days and is NOT removed by dropna() -- verify the
# series has no zeros.
transform_serie_to_stationary = dict(
    no_transformation=lambda x: x,
    just_differencing=lambda x: x.diff().dropna(),
    differencing_with_log=lambda x: x.apply(np.log).diff().dropna(),
    differencing_with_sqrt=lambda x: x.apply(np.sqrt).diff().dropna(),
)

# Print the ADF statistic, p-value and critical values per transformation.
for transformer_type, transformer in transform_serie_to_stationary.items():
    print(transformer_type)
    print_adfuller(adfuller(transformer(global_daily_deaths)))
no_transformation

    ADF Statistic: -2.50020123595216
    p-value: 0.11542536955639882
    Critical values {'1%': -3.44152019959894, '5%': -2.8664679191981297, '10%': -2.569394451038919}
    
just_differencing

    ADF Statistic: -4.623950081121708
    p-value: 0.0001166578466486249
    Critical values {'1%': -3.4415393130846725, '5%': -2.866476335860869, '10%': -2.5693989358590006}
    
differencing_with_log

    ADF Statistic: -3.0638525397210126
    p-value: 0.029346507982297262
    Critical values {'1%': -3.4415393130846725, '5%': -2.866476335860869, '10%': -2.5693989358590006}
    
differencing_with_sqrt

    ADF Statistic: -4.0797118586982295
    p-value: 0.001046074219669163
    Critical values {'1%': -3.4415393130846725, '5%': -2.866476335860869, '10%': -2.5693989358590006}
    
# First-order differenced deaths series used by the analyses below.
global_differenced_deaths = global_daily_deaths.diff().dropna()

Show first-order differencing¶

# Differenced deaths with 7-day rolling mean/std (same recipe as the
# daily plot above).
(
    global_differenced_deaths
    # Attach rolling statistics as extra columns suffixed _mean / _std.
    .pipe(
        lambda df: (
            df
            .join(
                df.rolling("7D").mean(),
                rsuffix="_mean"
            )
            .join(
                df.rolling("7D").std(),
                rsuffix="_std"
            )
        )
    )
    .reset_index()
    # pyjanitor: wide -> long, one row per (date, metric) pair.
    .pivot_longer(
        index="date",
        names_to="metric"
    )
    .assign(
        # Only "deaths" columns exist here; the regex also matching
        # "confirmed" is shared with the earlier pipeline.
        variable=lambda df: df.metric.str.extract(r"(confirmed|deaths)"),
        metric=lambda df: (
            df.metric.str.extract(r"(?:confirmed|deaths)_?(.+)")
            .fillna("Daily")
            .replace(
                {
                    "mean": "Rolling mean (7 days)",
                    "std": "Rolling std (7 days)"
                }
            )
        )
    )
    .pipe(
        lambda df: (
            px.line(
                df,
                x="date",
                y="value",
                color="metric",
                facet_row="variable",
                labels=dict(
                    date="Date",
                    value="Observations number",
                    metric="Trace",
                    variable="Variable"
                )
            )
            .update_yaxes(matches=None)
            .update_layout(
                legend=dict(
                    orientation="h",
                    yanchor="bottom",
                    y=1.02
                ),
                margin={
                    "r": 0.7,
                    "l": 0,
                },
                hovermode="x unified"
            )
        )
    )
)

Clarify seasonal parameters signal¶

# ACF and PACF of the differenced series, used to pick seasonal ARIMA orders.
series_corr_plot(global_differenced_deaths, nlags=30, fft=True)
series_corr_plot(global_differenced_deaths, nlags=30, plot_pacf=True)

Show seasonal differencing¶

# Seasonal differencing applied on top of the first difference.
# NOTE(review): .diff(8) uses lag 8, but the decomposition and the ARIMA
# below use a weekly period of 7 -- confirm 8 is intentional and not an
# off-by-one.
(
    global_differenced_deaths
    .diff(8)
    .dropna()
    # Attach 7-day rolling mean/std as *_mean / *_std columns.
    .pipe(
        lambda df: (
            df
            .join(
                df.rolling("7D").mean(),
                rsuffix="_mean"
            )
            .join(
                df.rolling("7D").std(),
                rsuffix="_std"
            )
        )
    )
    .reset_index()
    # pyjanitor: wide -> long, one row per (date, metric) pair.
    .pivot_longer(
        index="date",
        names_to="metric"
    )
   .assign(
        variable=lambda df: df.metric.str.extract(r"(confirmed|deaths)"),
        metric=lambda df: (
            df.metric.str.extract(r"(?:confirmed|deaths)_?(.+)")
            .fillna("Daily")
            .replace(
                {
                    "mean": "Rolling mean (7 days)",
                    "std": "Rolling std (7 days)"
                }
            )
        )
    )
    .pipe(
        lambda df: (
            px.line(
                df,
                x="date",
                y="value",
                color="metric",
                facet_row="variable",
                labels=dict(
                    date="Date",
                    value="Observations number",
                    metric="Trace",
                    variable="Variable"
                )
            )
            .update_yaxes(matches=None)
            .update_layout(
                legend=dict(
                    orientation="h",
                    yanchor="bottom",
                    y=1.02
                ),
                margin={
                    "r": 0.7,
                    "l": 0,
                },
                hovermode="x unified"
            )
        )
    )
)

Seasonal decomposition¶

# Additive seasonal decomposition with a weekly (7-day) period.
decompose_res = seasonal_decompose(global_daily_deaths, period=7)

# Stack observed/trend/seasonal/residual components and facet them.
(
    pd.concat(
    [
        decompose_res.observed,
        decompose_res.trend,
        decompose_res.seasonal,
        decompose_res.resid
    ],
    axis=1
    )
    # pyjanitor: rename column 0 -- presumably the observed component
    # comes back unnamed after concat; verify against the data.
    .rename_column(0, "observed")
    .reset_index()
    .pivot_longer(index="date")
    .pipe(
        lambda df: (
            px.line(
                df,
                x="date",
                y="value",
                color="variable",
                facet_row="variable",
                labels=dict(
                    date="Date",
                    value="Value"
                )
            )
            .update_yaxes(matches=None)
            .update_layout(showlegend=False, hovermode="x")
            # Keep only the component name in each facet label.
            .for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1].capitalize()))
        )
    )

)

Forecasting¶

Utility functions¶

# auto_arima preconfigured for a weekly-seasonal SARIMA search: the
# differencing orders are fixed (d=1, D=1), while (p, q, P, Q) are
# searched stepwise and ranked by AIC.
custom_auto_arima = partial(
    auto_arima,
    start_p=0,
    start_q=0,
    max_p=10,
    max_q=10,
    d=1, # Non-seasonal difference order
    seasonal=True, # Is the time series seasonal?
    m=7, # the seasonal period
    D=1, # seasonal difference order
    start_P=0,
    start_Q=0,
    max_P=3,
    max_Q=3,
    stepwise=True, # stepwise search instead of a full grid
    error_action="ignore",
    suppress_warnings=True,
    information_criterion="aic" # used to select best model
)

def run_auto_arima(series: pd.Series, n_forecast: int = 21):
    """Fit a seasonal ARIMA to ``series`` and forecast ``n_forecast`` days.

    Parameters
    ----------
    series : pd.Series
        Daily time series indexed by date (a DatetimeIndex is assumed so
        the forecast dates can be generated -- TODO confirm at call sites).
    n_forecast : int
        Number of future days to forecast (default 21).

    Returns
    -------
    tuple
        ``(model, original_predicted_df, forecast_df)`` where ``model`` is
        the fitted pmdarima estimator, ``original_predicted_df`` holds the
        original values alongside in-sample predictions, and ``forecast_df``
        holds the forecast with its lower/upper confidence bounds per date.
    """
    # Search for the best SARIMA using the file-level preconfigured helper.
    arima_model = custom_auto_arima(series)

    # Daily dates covering the forecast horizon, starting the day after
    # the last observation.
    new_dates_forecast = pd.date_range(
        start=series.index[-1] + pd.DateOffset(1),
        end=series.index[-1] + pd.DateOffset(n_forecast)
    )

    predictions, conf_interval = arima_model.predict(n_forecast, return_conf_int=True)
    # conf_interval columns are [lower, upper]. (Typo "predicitons" fixed.)
    lower_predictions, upper_predictions = conf_interval[:, 0], conf_interval[:, 1]

    original_predicted_df = pd.DataFrame(
        dict(
            original=series,
            predicted=arima_model.predict_in_sample()
        )
    ).reset_index()

    forecast_df = pd.DataFrame(
        dict(
            date=new_dates_forecast,
            forecast=predictions,
            lower_bound=lower_predictions,
            upper_bound=upper_predictions
        )
    )

    return arima_model, original_predicted_df, forecast_df

Find an ARIMA model for the data¶

# Fit the model on the full deaths series and forecast three weeks ahead.
arima_model, original_predicted_df, forecast_df = run_auto_arima(global_daily_deaths.deaths, n_forecast=21)
arima_model.summary()
SARIMAX Results
Dep. Variable: y No. Observations: 608
Model: SARIMAX(1, 1, 1)x(0, 1, 1, 7) Log Likelihood -5047.703
Date: Wed, 22 Sep 2021 AIC 10103.407
Time: 21:46:43 BIC 10120.994
Sample: 0 HQIC 10110.253
- 608
Covariance Type: opg
coef std err z P>|z| [0.025 0.975]
ar.L1 0.0766 0.037 2.048 0.041 0.003 0.150
ma.L1 -0.7373 0.031 -23.795 0.000 -0.798 -0.677
ma.S.L7 -0.7273 0.023 -31.990 0.000 -0.772 -0.683
sigma2 1.182e+06 2.18e+04 54.293 0.000 1.14e+06 1.22e+06
Ljung-Box (L1) (Q): 0.00 Jarque-Bera (JB): 20196.57
Prob(Q): 0.98 Prob(JB): 0.00
Heteroskedasticity (H): 5.74 Skew: 2.52
Prob(H) (two-sided): 0.00 Kurtosis: 30.97


Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).

Mean Absolute Error (MAE)¶

# Mean absolute error of the in-sample residuals.
mae = np.mean(np.abs(arima_model.resid()))
mae
664.543440636105

Model diagnostics¶

# Standard pmdarima residual diagnostics figure.
arima_model.plot_diagnostics();
_images/04-global-time-series_50_0.png

Plot model results¶

# Original vs in-sample predictions, plus the 21-day forecast with its
# confidence interval drawn as a shaded band.
(
    original_predicted_df
    .set_index("date")
    .plot()
    .add_scatter(
        name="Forecast",
        x=forecast_df.date,
        y=forecast_df.forecast,
        mode="lines",
    )
    # Invisible upper-bound trace + filled lower-bound trace -> CI band.
    .add_scatter(
        name="Upper Bound",
        x=forecast_df.date,
        y=forecast_df.upper_bound,
        mode="lines",
        marker=dict(color="#444"),
        line=dict(width=0),
        showlegend=False
    )
    .add_scatter(
        name="Lower Bound",
        x=forecast_df.date,
        y=forecast_df.lower_bound,
        marker=dict(color="#444"),
        line=dict(width=0),
        mode='lines',
        fillcolor='rgba(68, 68, 68, 0.3)',
        fill='tonexty',
        showlegend=False
    )
    .update_layout(
        xaxis_title="Date",
        yaxis_title="Number of deaths",
        hovermode="x unified",
        legend=dict(
            title="Variable",
            orientation="h",
            yanchor="bottom",
            y=1.02
        ),
        margin={
            "r": 0.7,
            "l": 0,
        },
    )
)

Save model¶

# Persist the fitted model to the models directory with joblib.
model_path = output_dir.joinpath("covid_death_time_series.pkl")
joblib.dump(arima_model, model_path);

Expected deaths statistics in the next three weeks¶

# Summary statistics of the forecast and its confidence bounds.
forecast_df.remove_columns("date").describe()
forecast lower_bound upper_bound
count 21.000000 21.000000 21.000000
mean 7927.408728 4717.210682 11137.606774
std 1513.058581 1838.019864 1494.501887
min 4969.781695 804.802408 7975.225435
25% 6935.270707 3487.851842 10261.268436
50% 8339.623117 5145.518542 11451.981013
75% 9165.462677 6267.372851 12277.438233
max 9822.320090 7572.520060 13138.729563
# Sanity checks: actual deaths over the last 21 observed days, and a naive
# "last week times three" estimate, for comparison with the forecast sum.
print(original_predicted_df.tail(21).original.sum())
print(original_predicted_df.tail(7).original.sum() * 3)
188757.0
178083.0
# Human-readable totals of the forecast over the three-week horizon.
model_text_res = f"""Predicted number of deaths:
Actually predicted: {forecast_df.forecast.sum().round()}
Lowest: {forecast_df.lower_bound.sum().round()}
Highest: {forecast_df.upper_bound.sum().round()}
"""

print(model_text_res)
Predicted number of deaths:
Actually predicted: 166476.0
Lowest: 99061.0
Highest: 233890.0

Vaccination¶

Global vaccinations time series¶

# Global vaccination totals per day (sum over countries).
# NOTE(review): .sum() also hits the non-numeric "country" column;
# behavior differs across pandas versions -- verify.
global_vaccination_cumulative_df = (
    vaccination_tidy_cumulative_df
    .groupby("date")
    .sum()
)

# Global vaccination time series, one line per metric.
# NOTE(review): "Doses administraded" is a typo in a user-facing label
# ("administered"); left unchanged here -- fix the literal separately.
(
    global_vaccination_cumulative_df
    .reset_index()
    .pivot_longer(
        index="date"
    )
    .assign(
        # Map raw column names to readable legend labels.
        variable=lambda df: (
            df
            .variable
            .replace(
                dict(
                    doses_admin="Doses administraded",
                    people_partially_vaccinated="People partially vaccinated",
                    people_fully_vaccinated="People fully vaccinated"
                )
            )
        )
    )
    .pipe(
        lambda df: (
            px.line(
                df,
                x="date",
                y="value",
                color="variable",
                labels=dict(
                    date="Date",
                    value="Value",
                    variable="Variable"
                )
            )
            .update_layout(
                hovermode="x unified",
                legend=dict(
                    orientation="h",
                    yanchor="bottom",
                    y=1.02
                )
            )
        )
    )
)